# Clean the environment with the following command
# NOTE(review): rm(list = ls()) only clears objects from the global
# environment (it does not detach packages or reset options). Acceptable in an
# interactive notebook, but avoid it in scripts that other code sources.
rm(list = ls())
This is an R Markdown Notebook for the Facies Classification Contest. In this notebook, I will try different classification methods that involve decision trees and ensemble classifiers.
This data is from the 2016 ML contest, with the focus for Facies Classification.
The provided data is a CSV file with well log information from different wells, containing 5 well logs and 2 indicator variables.
The goal is to train a model able to classify 9 different types of rocks, as listed in the following table:
| Facies | Description | Label | Adjacent Facies |
|---|---|---|---|
| 1 | Nonmarine Sandstone | SS | 2 |
| 2 | Nonmarine coarse siltstone | CSiS | 1,3 |
| 3 | Nonmarine fine siltstone | FSiS | 2 |
| 4 | Marine siltstone and shale | SiSh | 5 |
| 5 | Mudstone | MS | 4,6 |
| 6 | Wackestone | WS | 5,7,8 |
| 7 | Dolomite | D | 6,8 |
| 8 | Packstone-grainstone | PS | 6,7,9 |
| 9 | Phylloid-algal bafflestone | BS | 7,8 |
So, let’s take a look at the data!
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
# Read the facies well-log measurements into a data frame
data <- read.csv("../Data/facies_vectors.csv")
# Quick look at the first ten observations
head(data, 10)
Now I will create the list of features and facies names for future use:
# Predictor columns used by the models (five logs plus two indicators)
features <- c("GR", "ILD_log10", "DeltaPHI", "PHIND", "PE", "NM_M", "RELPOS")
# The five continuous well logs
well_logs <- c("GR", "ILD_log10", "DeltaPHI", "PHIND", "PE")
# Short labels for the nine facies classes
facies_names <- c("SS", "CSiS", "FSiS", "SiSh", "MS", "WS", "D", "PS", "BS")
# One display colour per facies code ("1" through "9")
facies_colors <- setNames(
  c("#F4D03F", "#F5B041", "#DC7633", "#6E2C00", "#1B4F72",
    "#2E86C1", "#AED6F1", "#A569BD", "#196F3D"),
  as.character(1:9)
)
# Target column and the blind well held out for validation
target <- "Facies"
testWell <- "SHANKLE"
# Load helper plotting/utility functions (plotwelllogs, getLegend, ...)
source("Utils.R")
# Plot the logs and facies of the blind well
pright <- plotwelllogs(datap = data, wname = testWell, dz = "Depth", wlogs = well_logs,
facies = "Facies", fcolor = facies_colors, fnames = facies_names)
# Inspect well "Recruit F9" (the actual removal from the data happens below)
plotwelllogs(datap = data, wname = "Recruit F9", dz = "Depth", wlogs = well_logs,
facies = "Facies", fcolor = facies_colors, fnames = facies_names)
## Warning: Removed 7 rows containing missing values (geom_path).
# Drop the pseudo-well "Recruit F9" from the dataset
dataf <- data[data$Well.Name != "Recruit F9", ]
# Wells remaining after the removal
unique(dataf$Well.Name)
## [1] SHRIMPLIN ALEXANDER D SHANKLE LUKE G U
## [5] KIMZEY A CROSS H CATTLE NOLAN NEWBY
## [9] CHURCHMAN BIBLE
## 10 Levels: ALEXANDER D CHURCHMAN BIBLE CROSS H CATTLE ... SHRIMPLIN
Split data into train and test datasets by selecting the well SHANKLE as the blind well for validation:
# xgboost requires class labels in 0 .. num_class - 1, so shift Facies down by one
temp <- dataf
temp$Facies <- temp$Facies - 1
# Hold out the blind well for validation; train on everything else
train <- temp[temp$Well.Name != testWell, ]
test <- temp[temp$Well.Name == testWell, ]
print(nrow(train))
## [1] 3620
print(nrow(test))
## [1] 449
# Sanity check: train and test partition the data exactly
print(nrow(dataf) - (nrow(train) + nrow(test)))
## [1] 0
For the gradient boosting classifier, I will use the xgboost package. Let’s load the libraries:
library(xgboost, quietly = TRUE, verbose = FALSE)
library(Matrix, quietly = TRUE, verbose = FALSE)
library(archdata, quietly = TRUE, verbose = FALSE)
library(caret, quietly = TRUE, verbose = FALSE)
library(dplyr, quietly = TRUE, verbose = FALSE)
library(Ckmeans.1d.dp, quietly = TRUE, verbose = FALSE)
library(e1071, quietly = TRUE, verbose = FALSE)
library(ggplot2, quietly = TRUE, verbose = FALSE)
library(gridExtra, quietly = TRUE, verbose = FALSE)
library(ggpubr, quietly = TRUE, verbose = FALSE)
library(gplots, quietly = TRUE, verbose = FALSE)
library(reshape2, quietly = TRUE, verbose = FALSE)
library(tidyverse, quietly = TRUE, verbose = FALSE)
library(GGally, quietly = TRUE, verbose = FALSE)
library(signal, quietly = TRUE, verbose = FALSE)
To use the xgboost package, we need to “convert” the training and test datasets to xgb.DMatrix:
# Wrap the train/test matrices as xgb.DMatrix objects
dtrain <- xgb.DMatrix(data.matrix(train[, features]),
                      label = data.matrix(train$Facies))
dtest <- xgb.DMatrix(data.matrix(test[, features]),
                     label = data.matrix(test$Facies))
# Number of facies classes present in the data
numberOfClasses <- length(unique(dataf$Facies))
# Booster parameters for the first gradient-boosted-trees model
param <- list(max_depth = 6,
              eta = 1,
              silent = 1,
              nthread = 2,
              #subsample = 0.5,
              objective = "multi:softmax",
              eval_metric = "merror",
              num_class = numberOfClasses)
# Sets whose error is reported after every boosting round
watchlist <- list(train = dtrain,
                  eval = dtest)
# Train the first Gradient Boosting Trees model
model01 <- xgb.train(param,
                     dtrain,
                     nrounds = 30,
                     watchlist)
## [1] train-merror:0.293923 eval-merror:0.616926
## [2] train-merror:0.231768 eval-merror:0.556793
## [3] train-merror:0.188398 eval-merror:0.574610
## [4] train-merror:0.159116 eval-merror:0.563474
## [5] train-merror:0.133702 eval-merror:0.556793
## [6] train-merror:0.106906 eval-merror:0.565702
## [7] train-merror:0.087017 eval-merror:0.561247
## [8] train-merror:0.067956 eval-merror:0.572383
## [9] train-merror:0.052762 eval-merror:0.561247
## [10] train-merror:0.037293 eval-merror:0.559020
## [11] train-merror:0.021823 eval-merror:0.561247
## [12] train-merror:0.018232 eval-merror:0.556793
## [13] train-merror:0.015193 eval-merror:0.554566
## [14] train-merror:0.010221 eval-merror:0.554566
## [15] train-merror:0.009116 eval-merror:0.552339
## [16] train-merror:0.006630 eval-merror:0.547884
## [17] train-merror:0.003867 eval-merror:0.545657
## [18] train-merror:0.003591 eval-merror:0.559020
## [19] train-merror:0.003591 eval-merror:0.559020
## [20] train-merror:0.003039 eval-merror:0.545657
## [21] train-merror:0.002486 eval-merror:0.545657
## [22] train-merror:0.002486 eval-merror:0.543430
## [23] train-merror:0.002486 eval-merror:0.541203
## [24] train-merror:0.002486 eval-merror:0.536748
## [25] train-merror:0.002486 eval-merror:0.534521
## [26] train-merror:0.002486 eval-merror:0.527840
## [27] train-merror:0.002486 eval-merror:0.534521
## [28] train-merror:0.002486 eval-merror:0.530067
## [29] train-merror:0.002486 eval-merror:0.527840
## [30] train-merror:0.002486 eval-merror:0.525612
# Predict facies for the test (blind well) and training data
test_pred <- predict(model01,
                     newdata = data.matrix(test[, features]))
train_pred <- predict(model01,
                      newdata = data.matrix(train[, features]))
test$Predictions <- test_pred
# Confusion matrix of the test set (+1 restores the original 1-based labels)
xtab <- createConfusionMatrix(test$Facies + 1,
                              test$Predictions + 1)
xtab <- t(xtab)
# Label rows/columns from the matrix dimensions instead of a hard-coded 1:8,
# so the labelling does not break if a different blind well is chosen.
# (assumes classes present are contiguous from 1 — TODO confirm for other wells)
rownames(xtab) <- facies_names[seq_len(nrow(xtab))]
colnames(xtab) <- facies_names[seq_len(ncol(xtab))]
plotConfusionMatrix(xtab, relScale = T)
# Confusion matrix and statistical information via caret; dedicated names are
# used so that a data object called "t" does not shadow base::t().
lvls <- union(test$Predictions + 1, test$Facies + 1)
conf_tab <- table(factor(test$Predictions + 1, lvls), factor(test$Facies + 1, lvls))
CM <- confusionMatrix(conf_tab,
                      mode = "everything")
print(CM)
## Confusion Matrix and Statistics
##
##
## 3 2 1 5 8 6 4 7
## 3 65 16 0 0 0 1 0 0
## 2 50 65 82 0 0 0 0 0
## 1 2 8 7 0 0 0 0 0
## 5 0 0 0 1 2 10 0 0
## 8 0 0 0 1 17 12 1 3
## 6 0 0 0 9 19 46 5 2
## 4 0 0 0 7 0 2 1 1
## 7 0 0 0 1 2 0 0 11
##
## Overall Statistics
##
## Accuracy : 0.4744
## 95% CI : (0.4274, 0.5217)
## No Information Rate : 0.2606
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3589
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 3 Class: 2 Class: 1 Class: 5 Class: 8 Class: 6
## Sensitivity 0.5556 0.7303 0.07865 0.052632 0.42500 0.6479
## Specificity 0.9488 0.6333 0.97222 0.972093 0.95844 0.9074
## Pos Pred Value 0.7927 0.3299 0.41176 0.076923 0.50000 0.5679
## Neg Pred Value 0.8583 0.9048 0.81019 0.958716 0.94458 0.9321
## Precision 0.7927 0.3299 0.41176 0.076923 0.50000 0.5679
## Recall 0.5556 0.7303 0.07865 0.052632 0.42500 0.6479
## F1 0.6533 0.4545 0.13208 0.062500 0.45946 0.6053
## Prevalence 0.2606 0.1982 0.19822 0.042316 0.08909 0.1581
## Detection Rate 0.1448 0.1448 0.01559 0.002227 0.03786 0.1024
## Detection Prevalence 0.1826 0.4388 0.03786 0.028953 0.07572 0.1804
## Balanced Accuracy 0.7522 0.6818 0.52544 0.512362 0.69172 0.7776
## Class: 4 Class: 7
## Sensitivity 0.142857 0.64706
## Specificity 0.977376 0.99306
## Pos Pred Value 0.090909 0.78571
## Neg Pred Value 0.986301 0.98621
## Precision 0.090909 0.78571
## Recall 0.142857 0.64706
## F1 0.111111 0.70968
## Prevalence 0.015590 0.03786
## Detection Rate 0.002227 0.02450
## Detection Prevalence 0.024499 0.03118
## Balanced Accuracy 0.560116 0.82006
# Shift labels back to 1..9 and plot predictions vs. true facies
dtemp <- test
dtemp$Predictions <- dtemp$Predictions + 1
dtemp$Facies <- dtemp$Facies + 1
plotwellPred(datap = dtemp, wname = testWell, dz = "Depth", wlogs = well_logs,
             facies = "Facies", classy = "Predictions", fcolor = facies_colors,
             fnames = facies_names)
Now, let’s do some analysis of the data. First, let’s evaluate if there are any missing values on each well log and indicators:
# Report the number of missing values per feature column.
# A plain for-loop replaces the original lapply() whose list result (all NULLs
# from cat) was assigned to an unused variable.
for (x in features) {
  cat("Missing values of ", x, ": ", sum(is.na(dataf[, x])), "\n", sep = "")
}
## Missing values of GR: 0
## Missing values of ILD_log10: 0
## Missing values of DeltaPHI: 0
## Missing values of PHIND: 0
## Missing values of PE: 905
## Missing values of NM_M: 0
## Missing values of RELPOS: 0
Okay, there are 905 missing values for “PE”. For now, let’s replace the missing data simply by the “PE” column average:
# Mean-impute the missing PE values (simple first-pass imputation)
pe_mean <- mean(dataf[, "PE"], na.rm = TRUE)
dataf[is.na(dataf[, "PE"]), "PE"] <- pe_mean
cat("Missing values of PE:", sum(is.na(dataf[, "PE"])))
## Missing values of PE: 0
Now, let’s check all the well logs and facies classification for all the wells. The function plotwelllogs can be loaded from the file Utils.R:
# Plot logs and facies for every remaining well
wellname <- unique(dataf$Well.Name)
for (w in wellname) {
  plotwelllogs(datap = dataf, wname = w, dz = "Depth", wlogs = well_logs,
               facies = "Facies", fcolor = facies_colors, fnames = facies_names)
}
# Keep the facies legend in a variable for reuse in later plots
pright <- getLegend(data, "CHURCHMAN BIBLE", "Depth", well_logs, "Facies",
                    facies_colors, facies_names)
The PE for the wells Alexander D and Kimzey A are, actually, missing. Earlier I replaced the missing values by the average of the whole dataset, but it shows to be a poor solution. Instead, I’ll try now to replace the missing values by a linear regression prediction over the other well logs available. For that, we remove the “damaged” wells from the data, train a linear regression model, and do the predictions on the desired wells.
# Selecting the lists of "good" and "damaged" wells
dwells <- c("ALEXANDER D", "KIMZEY A")   # Wells whose PE log is missing
gwells <- setdiff(wellname, dwells)      # Good wells
gwells <- setdiff(gwells, testWell)      # ... excluding the blind well
# Fit a linear model for PE from the other logs, on the good wells only
lmModel <- lm(PE ~ GR + ILD_log10 + DeltaPHI + PHIND,
              data = dataf[dataf$Well.Name %in% gwells, ])
Let’s check the solution over the test well:
# Predict PE on the blind well and compare against the measured log
pred <- predict(lmModel, dataf[dataf$Well.Name %in% testWell, ])
testlm <- dataf[dataf$Well.Name %in% testWell, ]
testlm[, "PE"] <- pred
# Measured PE
plotwelllogs(datap = dataf, wname = testWell, dz = "Depth", wlogs = well_logs,
             facies = "Facies", fcolor = facies_colors, fnames = facies_names)
# Regressed PE
plotwelllogs(datap = testlm, wname = testWell, dz = "Depth", wlogs = well_logs,
             facies = "Facies", fcolor = facies_colors, fnames = facies_names)
Retrain the linear regression with all the good wells:
# Selecting the lists of "good" and "damaged" wells; this time the blind well
# stays among the good wells for the final fit
dwells <- c("ALEXANDER D", "KIMZEY A")   # Wells whose PE log is missing
gwells <- setdiff(wellname, dwells)      # Good wells
# Refit the linear model for PE on all good wells
lmModel <- lm(PE ~ GR + ILD_log10 + DeltaPHI + PHIND,
              data = dataf[dataf$Well.Name %in% gwells, ])
Predicting the PE for the wells with missing values:
# Fill in PE for the damaged wells using the regression model
predictions <- predict(lmModel, dataf[dataf$Well.Name %in% dwells, ])
datalm <- dataf
datalm[which(datalm$Well.Name %in% dwells), "PE"] <- predictions
cat("Missing values of PE:", sum(is.na(datalm[, "PE"])))
## Missing values of PE: 0
Let’s make the plot for both wells:
# Compare imputed (datalm) vs. original (dataf) logs for both damaged wells
for (w in dwells) {
  plotwelllogs(datap = datalm, wname = w, dz = "Depth", wlogs = well_logs,
               facies = "Facies", fcolor = facies_colors, fnames = facies_names)
  plotwelllogs(datap = dataf, wname = w, dz = "Depth", wlogs = well_logs,
               facies = "Facies", fcolor = facies_colors, fnames = facies_names)
}
Okay, good. Let’s recreate the gradient boosting model with this new dataset. First, after the data imputation, split the data into test and train sets:
# xgboost requires labels in 0 .. num_class - 1
temp <- datalm
temp$Facies <- temp$Facies - 1
# Hold out the blind well again, now from the imputed data
trainlm <- temp[temp$Well.Name != testWell, ]
testlm <- temp[temp$Well.Name == testWell, ]
Run the xgboost:
# Wrap the imputed train/test sets as xgb.DMatrix objects
dtrainlm <- xgb.DMatrix(data.matrix(trainlm[, features]),
                        label = data.matrix(trainlm$Facies))
dtestlm <- xgb.DMatrix(data.matrix(testlm[, features]),
                       label = data.matrix(testlm$Facies))
# Number of facies classes in the data
numberOfClasses <- length(unique(dataf$Facies))
# Booster parameters (smaller learning rate than the first model)
param <- list(max_depth = 6,
              eta = .5,
              silent = 1,
              nthread = 2,
              #subsample = 0.5,
              objective = "multi:softmax",
              eval_metric = "merror",
              num_class = numberOfClasses)
# Sets whose error is reported after every boosting round
watchlist <- list(train = dtrainlm,
                  eval = dtestlm)
# Gradient Boosting Trees model on the imputed data
modellm <- xgb.train(param,
                     dtrainlm,
                     nrounds = 75,
                     watchlist)
## [1] train-merror:0.290608 eval-merror:0.648107
## [2] train-merror:0.259392 eval-merror:0.596882
## [3] train-merror:0.234254 eval-merror:0.583519
## [4] train-merror:0.220994 eval-merror:0.574610
## [5] train-merror:0.200829 eval-merror:0.565702
## [6] train-merror:0.183978 eval-merror:0.556793
## [7] train-merror:0.170166 eval-merror:0.543430
## [8] train-merror:0.157735 eval-merror:0.556793
## [9] train-merror:0.147790 eval-merror:0.543430
## [10] train-merror:0.139503 eval-merror:0.536748
## [11] train-merror:0.124586 eval-merror:0.543430
## [12] train-merror:0.116022 eval-merror:0.543430
## [13] train-merror:0.106077 eval-merror:0.541203
## [14] train-merror:0.092265 eval-merror:0.543430
## [15] train-merror:0.083978 eval-merror:0.532294
## [16] train-merror:0.077348 eval-merror:0.534521
## [17] train-merror:0.069337 eval-merror:0.541203
## [18] train-merror:0.062155 eval-merror:0.536748
## [19] train-merror:0.058011 eval-merror:0.536748
## [20] train-merror:0.051105 eval-merror:0.545657
## [21] train-merror:0.043094 eval-merror:0.543430
## [22] train-merror:0.040055 eval-merror:0.527840
## [23] train-merror:0.034807 eval-merror:0.530067
## [24] train-merror:0.032873 eval-merror:0.530067
## [25] train-merror:0.030387 eval-merror:0.530067
## [26] train-merror:0.026519 eval-merror:0.532294
## [27] train-merror:0.021271 eval-merror:0.532294
## [28] train-merror:0.018232 eval-merror:0.532294
## [29] train-merror:0.015746 eval-merror:0.523385
## [30] train-merror:0.015470 eval-merror:0.523385
## [31] train-merror:0.013812 eval-merror:0.525612
## [32] train-merror:0.011602 eval-merror:0.532294
## [33] train-merror:0.010497 eval-merror:0.527840
## [34] train-merror:0.008840 eval-merror:0.525612
## [35] train-merror:0.008011 eval-merror:0.530067
## [36] train-merror:0.006077 eval-merror:0.516704
## [37] train-merror:0.005249 eval-merror:0.516704
## [38] train-merror:0.005249 eval-merror:0.512249
## [39] train-merror:0.005249 eval-merror:0.503341
## [40] train-merror:0.004972 eval-merror:0.505568
## [41] train-merror:0.004696 eval-merror:0.501114
## [42] train-merror:0.004420 eval-merror:0.516704
## [43] train-merror:0.004144 eval-merror:0.510022
## [44] train-merror:0.003591 eval-merror:0.501114
## [45] train-merror:0.003039 eval-merror:0.501114
## [46] train-merror:0.002762 eval-merror:0.501114
## [47] train-merror:0.003039 eval-merror:0.501114
## [48] train-merror:0.002762 eval-merror:0.507795
## [49] train-merror:0.002762 eval-merror:0.512249
## [50] train-merror:0.002486 eval-merror:0.512249
## [51] train-merror:0.002486 eval-merror:0.510022
## [52] train-merror:0.002486 eval-merror:0.512249
## [53] train-merror:0.002486 eval-merror:0.507795
## [54] train-merror:0.002486 eval-merror:0.507795
## [55] train-merror:0.002486 eval-merror:0.503341
## [56] train-merror:0.002486 eval-merror:0.503341
## [57] train-merror:0.002486 eval-merror:0.503341
## [58] train-merror:0.002486 eval-merror:0.505568
## [59] train-merror:0.002486 eval-merror:0.510022
## [60] train-merror:0.002486 eval-merror:0.507795
## [61] train-merror:0.002486 eval-merror:0.512249
## [62] train-merror:0.002486 eval-merror:0.516704
## [63] train-merror:0.002486 eval-merror:0.514477
## [64] train-merror:0.002486 eval-merror:0.505568
## [65] train-merror:0.002486 eval-merror:0.505568
## [66] train-merror:0.002486 eval-merror:0.507795
## [67] train-merror:0.002486 eval-merror:0.507795
## [68] train-merror:0.002486 eval-merror:0.507795
## [69] train-merror:0.002486 eval-merror:0.512249
## [70] train-merror:0.002486 eval-merror:0.505568
## [71] train-merror:0.002486 eval-merror:0.510022
## [72] train-merror:0.002486 eval-merror:0.510022
## [73] train-merror:0.002486 eval-merror:0.507795
## [74] train-merror:0.002486 eval-merror:0.503341
## [75] train-merror:0.002486 eval-merror:0.498886
# Predict on the imputed test/train sets
testlm_pred <- predict(modellm,
                       newdata = data.matrix(testlm[, features]))
trainlm_pred <- predict(modellm,
                        newdata = data.matrix(trainlm[, features]))
testlm$Predictions <- testlm_pred
# Confusion matrix of the test set (+1 restores 1-based labels)
xtablm <- createConfusionMatrix(testlm$Facies + 1,
                                testlm$Predictions + 1)
xtablm <- t(xtablm)
# Label by the matrix dimensions instead of a hard-coded 1:8
rownames(xtablm) <- facies_names[seq_len(nrow(xtablm))]
colnames(xtablm) <- facies_names[seq_len(ncol(xtablm))]
plotConfusionMatrix(xtablm, relScale = T)
plotConfusionMatrix(xtab, relScale = T)
# Confusion matrix and statistical information; dedicated names avoid
# shadowing base::t() with a data object
lvls <- union(testlm$Predictions + 1, testlm$Facies + 1)
conf_tab <- table(factor(testlm$Predictions + 1, lvls), factor(testlm$Facies + 1, lvls))
CMlm <- confusionMatrix(conf_tab,
                        mode = "everything")
cat("Accuracy with imputed data:", CMlm$overall[1], '\n')
## Accuracy with imputed data: 0.5011136
cat("Previous accuracy:", CM$overall[1], '\n')
## Previous accuracy: 0.4743875
# Shift labels back to 1..9 and plot predictions vs. true facies
dtemp <- testlm
dtemp$Predictions <- dtemp$Predictions + 1
dtemp$Facies <- dtemp$Facies + 1
plotwellPred(datap = dtemp, wname = testWell, dz = "Depth", wlogs = well_logs,
             facies = "Facies", classy = "Predictions", fcolor = facies_colors,
             fnames = facies_names)
The data imputation of the missing PE logs increased the process accuracy, but it is still too low. We need to work with feature augmentation. In other words, we need to use our data to create new features by applying some combination rules.
Let’s do some analysis of the data and do a pairs plot:
# Pairs plot of the five well logs coloured by facies
pm <- ggpairs(datalm,
              columns = match(well_logs, colnames(datalm)),
              aes(colour = factor(Facies),
                  alpha = 0.3),
              diag = list(continuous = "barDiag"),
              upper = list(continuous = "density"),
              lower = list(continuous = "points", combo = "dot_no_facet"),
              legend = pright)
# Apply the facies palette to every panel.
# seq_len() replaces 1:n, which would iterate c(1, 0) if n were ever 0.
for (i in seq_len(pm$nrow)) {
  for (j in seq_len(pm$ncol)) {
    pm[i, j] <- pm[i, j] +
      scale_fill_manual(values = facies_colors) +
      scale_color_manual(values = facies_colors)
  }
}
pm
We can see some “circular” behavior of the pairs plot and data classification (facies). It is more clear when analyzing the density plots. So, to see if we can improve the facies classification, let’s convert the “cardinal” coordinates of the pairs to polar coordinates, using the function cart2polwells from the support file Utils.R.
# Add polar-coordinate versions of the log pairs (helper from Utils.R)
data_polar <- cart2polwells(datalm, well_logs)
# Columns that are not predictive features
nofeatures <- c("Facies", "Formation", "Well.Name", "Depth")
# Everything else is a feature for the polar model
features_pol <- setdiff(colnames(data_polar), nofeatures)
Now, with the new coordinates, recreate the test and validation sets and then train the model:
# xgboost requires labels in 0 .. num_class - 1
temp <- data_polar
temp$Facies <- temp$Facies - 1
# Splitting data
trainpol <- temp[temp$Well.Name != testWell, ]
testpol <- temp[temp$Well.Name == testWell, ]
# Converting datasets to xgb.DMatrix
dtrainpol <- xgb.DMatrix(data.matrix(trainpol[, features_pol]),
                         label = data.matrix(trainpol$Facies))
dtestpol <- xgb.DMatrix(data.matrix(testpol[, features_pol]),
                        label = data.matrix(testpol$Facies))
# (an unused "mdepth" assignment was removed; max_depth below is explicit)
# Setting up parameters
param <- list(max_depth = 5,
              eta = .3,
              silent = 1,
              nthread = 10,
              objective = "multi:softmax",
              eval_metric = "merror",
              num_class = numberOfClasses)
# Defining training and evaluation sets
watchlist <- list(train = dtrainpol,
                  eval = dtestpol)
# Gradient Boosting Trees model on polar features
modelpol <- xgb.train(param,
                      dtrainpol,
                      nrounds = 47,
                      watchlist)
## [1] train-merror:0.301105 eval-merror:0.563474
## [2] train-merror:0.276796 eval-merror:0.534521
## [3] train-merror:0.252486 eval-merror:0.534521
## [4] train-merror:0.240884 eval-merror:0.523385
## [5] train-merror:0.233425 eval-merror:0.516704
## [6] train-merror:0.220442 eval-merror:0.514477
## [7] train-merror:0.216575 eval-merror:0.534521
## [8] train-merror:0.204696 eval-merror:0.518931
## [9] train-merror:0.195304 eval-merror:0.512249
## [10] train-merror:0.187845 eval-merror:0.507795
## [11] train-merror:0.181492 eval-merror:0.516704
## [12] train-merror:0.171547 eval-merror:0.510022
## [13] train-merror:0.164917 eval-merror:0.505568
## [14] train-merror:0.156630 eval-merror:0.501114
## [15] train-merror:0.152210 eval-merror:0.505568
## [16] train-merror:0.148066 eval-merror:0.505568
## [17] train-merror:0.145028 eval-merror:0.510022
## [18] train-merror:0.138122 eval-merror:0.501114
## [19] train-merror:0.130111 eval-merror:0.516704
## [20] train-merror:0.124862 eval-merror:0.512249
## [21] train-merror:0.120718 eval-merror:0.512249
## [22] train-merror:0.116575 eval-merror:0.518931
## [23] train-merror:0.113812 eval-merror:0.516704
## [24] train-merror:0.107459 eval-merror:0.516704
## [25] train-merror:0.099448 eval-merror:0.518931
## [26] train-merror:0.093923 eval-merror:0.516704
## [27] train-merror:0.087293 eval-merror:0.516704
## [28] train-merror:0.083149 eval-merror:0.514477
## [29] train-merror:0.076519 eval-merror:0.507795
## [30] train-merror:0.072099 eval-merror:0.512249
## [31] train-merror:0.069890 eval-merror:0.514477
## [32] train-merror:0.067956 eval-merror:0.514477
## [33] train-merror:0.064641 eval-merror:0.510022
## [34] train-merror:0.058564 eval-merror:0.507795
## [35] train-merror:0.056354 eval-merror:0.512249
## [36] train-merror:0.052762 eval-merror:0.512249
## [37] train-merror:0.049171 eval-merror:0.503341
## [38] train-merror:0.046961 eval-merror:0.496659
## [39] train-merror:0.043370 eval-merror:0.501114
## [40] train-merror:0.039779 eval-merror:0.496659
## [41] train-merror:0.036740 eval-merror:0.503341
## [42] train-merror:0.030663 eval-merror:0.503341
## [43] train-merror:0.027072 eval-merror:0.494432
## [44] train-merror:0.024586 eval-merror:0.492205
## [45] train-merror:0.022099 eval-merror:0.492205
## [46] train-merror:0.020442 eval-merror:0.489978
## [47] train-merror:0.019061 eval-merror:0.487751
Plot confusion matrix and evaluate accuracy:
# Predict on the polar-feature test/train sets
testpol_pred <- predict(modelpol,
                        newdata = data.matrix(testpol[, features_pol]))
trainpol_pred <- predict(modelpol,
                         newdata = data.matrix(trainpol[, features_pol]))
testpol$Predictions <- testpol_pred
# Confusion matrix of test set
xtabpol <- createConfusionMatrix(testpol$Facies + 1,
                                 testpol$Predictions + 1)
xtabpol <- t(xtabpol)
# BUG FIX: rows used facies_names[1:8] while columns used all 9 names.
# Label both dimensions consistently from the matrix dimensions.
rownames(xtabpol) <- facies_names[seq_len(nrow(xtabpol))]
colnames(xtabpol) <- facies_names[seq_len(ncol(xtabpol))]
plotConfusionMatrix(xtablm, relScale = T)
plotConfusionMatrix(xtabpol, relScale = T)
# Confusion matrix and statistical information; dedicated names avoid
# shadowing base::t() with a data object
lvls <- union(testpol$Predictions + 1, testpol$Facies + 1)
conf_tab <- table(factor(testpol$Predictions + 1, lvls), factor(testpol$Facies + 1, lvls))
CMpol <- confusionMatrix(conf_tab,
                         mode = "everything")
cat("Accuracy (polar): ",CMpol$overall[1], "\n")
## Accuracy (polar): 0.5122494
cat("Accuracy (LM): ",CMlm$overall[1], "\n")
## Accuracy (LM): 0.5011136
cat("Accuracy (original):",CM$overall[1], "\n")
## Accuracy (original): 0.4743875
# Shift labels back to 1..9 and plot predictions vs. true facies
dtemp <- testpol
dtemp$Predictions <- dtemp$Predictions + 1
dtemp$Facies <- dtemp$Facies + 1
plotwellPred(datap = dtemp, wname = testWell, dz = "Depth", wlogs = well_logs,
             facies = "Facies", classy = "Predictions", fcolor = facies_colors,
             fnames = facies_names)
With the data imputation and the polar coordinate changes, we could increase the precision of the model by around 3.5 percentage points. Now, other competitors used the feature gradients as new features. Let’s do the same by using the function features_gradient from the file Utils.R:
# Depth-gradient versions of every polar feature (helper from Utils.R)
data_grad <- features_gradient(data_polar, features_pol, wellname)
# Everything except the non-feature columns becomes a model input
features_grad <- setdiff(colnames(data_grad), nofeatures)
Now, new model training:
# xgboost requires labels in 0 .. num_class - 1
temp <- data_grad
temp$Facies <- temp$Facies - 1
# Splitting data
traingrad <- temp[temp$Well.Name != testWell, ]
testgrad <- temp[temp$Well.Name == testWell, ]
# Converting datasets to xgb.DMatrix
dtraingrad <- xgb.DMatrix(data.matrix(traingrad[, features_grad]),
                          label = data.matrix(traingrad$Facies))
dtestgrad <- xgb.DMatrix(data.matrix(testgrad[, features_grad]),
                         label = data.matrix(testgrad$Facies))
# (an unused "mdepth" assignment was removed; max_depth below is explicit)
# Setting up parameters
param <- list(max_depth = 7,
              eta = .3,
              silent = 1,
              nthread = 2,
              objective = "multi:softmax",
              eval_metric = "merror",
              num_class = numberOfClasses)
# Defining training and evaluation sets
watchlist <- list(train = dtraingrad,
                  eval = dtestgrad)
# Gradient Boosting Trees model on gradient features
modelgrad <- xgb.train(param,
                       dtraingrad,
                       nrounds = 11,
                       watchlist)
## [1] train-merror:0.203591 eval-merror:0.516704
## [2] train-merror:0.158840 eval-merror:0.487751
## [3] train-merror:0.138122 eval-merror:0.478842
## [4] train-merror:0.118508 eval-merror:0.469933
## [5] train-merror:0.105525 eval-merror:0.485523
## [6] train-merror:0.090884 eval-merror:0.472160
## [7] train-merror:0.080939 eval-merror:0.472160
## [8] train-merror:0.067956 eval-merror:0.465479
## [9] train-merror:0.055525 eval-merror:0.465479
## [10] train-merror:0.048619 eval-merror:0.454343
## [11] train-merror:0.041160 eval-merror:0.445434
# Predict on the gradient-feature test/train sets
testgrad_pred <- predict(modelgrad,
                         newdata = data.matrix(testgrad[, features_grad]))
traingrad_pred <- predict(modelgrad,
                          newdata = data.matrix(traingrad[, features_grad]))
testgrad$Predictions <- testgrad_pred
# Confusion matrix of test set (+1 restores 1-based labels)
xtabgrad <- createConfusionMatrix(testgrad$Facies + 1,
                                  testgrad$Predictions + 1)
xtabgrad <- t(xtabgrad)
# Label by the matrix dimensions instead of a hard-coded 1:8
rownames(xtabgrad) <- facies_names[seq_len(nrow(xtabgrad))]
colnames(xtabgrad) <- facies_names[seq_len(ncol(xtabgrad))]
plotConfusionMatrix(xtabgrad, relScale = T)
plotConfusionMatrix(xtabpol, relScale = T)
# Confusion matrix and statistical information; dedicated names avoid
# shadowing base::t() with a data object
lvls <- union(testgrad$Predictions + 1, testgrad$Facies + 1)
conf_tab <- table(factor(testgrad$Predictions + 1, lvls), factor(testgrad$Facies + 1, lvls))
CMgrad <- confusionMatrix(conf_tab,
                          mode = "everything")
cat("Accuracy (gradient):",CMgrad$overall[1], "\n")
## Accuracy (gradient): 0.5545657
cat("Accuracy (polar): ",CMpol$overall[1], "\n")
## Accuracy (polar): 0.5122494
cat("Accuracy (LM): ",CMlm$overall[1], "\n")
## Accuracy (LM): 0.5011136
cat("Accuracy (original):",CM$overall[1], "\n")
## Accuracy (original): 0.4743875
# Shift labels back to 1..9 and plot predictions vs. true facies
dtemp <- testgrad
dtemp$Predictions <- dtemp$Predictions + 1
dtemp$Facies <- dtemp$Facies + 1
plotwellPred(datap = dtemp, wname = testWell, dz = "Depth", wlogs = well_logs,
             facies = "Facies", classy = "Predictions", fcolor = facies_colors,
             fnames = facies_names)
Creating the gradient over all the features showed to be an excellent decision, and the accuracy is increased. To deliver the final facies classification predictions, we assume that an isolated facies inside a “block” is something rare (not impossible, though). So, we will “fix” it by applying a median filter.
We will use the running-median function runmed to filter isolated classified facies in an area, replacing them with the median of a sliding window.
# Median-filter the predicted facies to remove isolated single-sample classes
testgrad2 <- testgrad
testgrad2$Classy <- testgrad_pred
# Iterate only over wells actually present in the test set: looping over every
# well name would hand zero-length vectors to runmed().
for (w in unique(testgrad2$Well.Name)) {
  rows <- which(testgrad2$Well.Name %in% w)
  testgrad2[rows, "Classy"] <- runmed(testgrad2[rows, "Classy"],
                                      k = 5, endrule = "keep")
}
# Confusion matrix of the filtered test set
# (an unused "u" assignment, overwritten before use, was removed)
xtabgrad2 <- createConfusionMatrix(testgrad2$Facies + 1,
                                   testgrad2$Classy + 1)
xtabgrad2 <- t(xtabgrad2)
rownames(xtabgrad2) <- facies_names[seq_len(nrow(xtabgrad2))]
colnames(xtabgrad2) <- facies_names[seq_len(ncol(xtabgrad2))]
plotConfusionMatrix(xtabgrad, relScale = T)
plotConfusionMatrix(xtabgrad2, relScale = T)
# Confusion matrix and statistical information; dedicated names avoid
# shadowing base::t() with a data object
lvls <- union(testgrad2$Classy + 1, testgrad2$Facies + 1)
conf_tab <- table(factor(testgrad2$Classy + 1, lvls), factor(testgrad2$Facies + 1, lvls))
CMgrad2 <- confusionMatrix(conf_tab,
                           mode = "everything")
cat("Accuracy (gradient):",CMgrad2$overall[1], "\n")
## Accuracy (gradient): 0.5701559
Plotting the final predictions:
# Plot the median-filtered predictions against the true facies
dtemp <- testgrad2
dtemp$Classy <- dtemp$Classy + 1
dtemp$Facies <- dtemp$Facies + 1
plotwellPred(datap = dtemp, wname = testWell, dz = "Depth", wlogs = well_logs,
             facies = "Facies", classy = "Classy", fcolor = facies_colors,
             fnames = facies_names)
Checking feature importance:
# Feature importance of the gradient-feature model (top 10 by gain)
importance <- xgb.importance(feature_names = features_grad, model = modelgrad)
xgb.ggplot.importance(importance, top_n = 10, measure = "Gain",
                      rel_to_first = T, n_clusters = 5)
The choice of the package xgboost led us to a very accurate facies classification, even considering the low amount of data to train the gradient boosting trees models.
Now, let’s try to run a clustering algorithm to create new features. Clustering algorithms, for this purpose, are considered “unsupervised learning”, as they do not require training with the correct answer. Those algorithms just find similarities between the data inputs and separate them into clusters.
The next cell will replace missing values of the data by 0 and, after that, use k-means to create 6 clusters in the data.
# Fix the RNG state so the k-means result is reproducible
set.seed(101)
# Features excluded from the clustering (indicators and their gradients)
no_clust <- c("RELPOS", "NM_M", "RELPOS_grad", "NM_M_grad")
features_cluster <- setdiff(features_grad, no_clust)
# Replace remaining missing values by 0 before clustering
for (x in features_cluster) {
  data_grad[is.na(data_grad[, x]), x] <- 0
}
# Partition the observations into 6 clusters and attach the labels as a factor
Cluster <- kmeans(as.matrix(data_grad[, features_cluster]), 6)
data_grad$cluster <- as.factor(Cluster$cluster)
head(data_grad)
Training the classifier:
# xgboost requires labels in 0 .. num_class - 1
temp <- data_grad
temp$Facies <- temp$Facies - 1
# Splitting data
trainclust <- temp[temp$Well.Name != testWell, ]
testclust <- temp[temp$Well.Name == testWell, ]
# New set of features: gradient features plus the k-means cluster label
features_clust <- c(features_grad, "cluster")
# Converting datasets to xgb.DMatrix
dtrainclust <- xgb.DMatrix(data.matrix(trainclust[, features_clust]),
                           label = data.matrix(trainclust$Facies))
dtestclust <- xgb.DMatrix(data.matrix(testclust[, features_clust]),
                          label = data.matrix(testclust$Facies))
# (an unused "mdepth" assignment was removed; max_depth below is explicit)
# Setting up parameters
param <- list(max_depth = 8,
              eta = .3,
              silent = 1,
              nthread = 2,
              objective = "multi:softmax",
              eval_metric = "merror",
              num_class = numberOfClasses)
# Defining training and evaluation sets
watchlist <- list(train = dtrainclust,
                  eval = dtestclust)
# Gradient Boosting Trees model on gradient + cluster features
modelclust <- xgb.train(param,
                        dtrainclust,
                        nrounds = 27,
                        watchlist)
## [1] train-merror:0.172928 eval-merror:0.525612
## [2] train-merror:0.129834 eval-merror:0.492205
## [3] train-merror:0.099171 eval-merror:0.487751
## [4] train-merror:0.079558 eval-merror:0.469933
## [5] train-merror:0.066575 eval-merror:0.487751
## [6] train-merror:0.056630 eval-merror:0.485523
## [7] train-merror:0.041713 eval-merror:0.485523
## [8] train-merror:0.031492 eval-merror:0.483296
## [9] train-merror:0.022928 eval-merror:0.481069
## [10] train-merror:0.017956 eval-merror:0.474388
## [11] train-merror:0.014088 eval-merror:0.472160
## [12] train-merror:0.009945 eval-merror:0.465479
## [13] train-merror:0.008287 eval-merror:0.463252
## [14] train-merror:0.005801 eval-merror:0.463252
## [15] train-merror:0.004144 eval-merror:0.463252
## [16] train-merror:0.002762 eval-merror:0.463252
## [17] train-merror:0.002486 eval-merror:0.469933
## [18] train-merror:0.001934 eval-merror:0.465479
## [19] train-merror:0.001657 eval-merror:0.465479
## [20] train-merror:0.001657 eval-merror:0.463252
## [21] train-merror:0.001657 eval-merror:0.461024
## [22] train-merror:0.001657 eval-merror:0.456570
## [23] train-merror:0.001657 eval-merror:0.454343
## [24] train-merror:0.001657 eval-merror:0.452116
## [25] train-merror:0.001381 eval-merror:0.452116
## [26] train-merror:0.001105 eval-merror:0.454343
## [27] train-merror:0.001105 eval-merror:0.456570
# Predict on the cluster-feature test/train sets
testclust_pred <- predict(modelclust,
                          newdata = data.matrix(testclust[, features_clust]))
trainclust_pred <- predict(modelclust,
                           newdata = data.matrix(trainclust[, features_clust]))
testclust$Predictions <- testclust_pred
# Confusion matrix of test set
xtabclust <- createConfusionMatrix(testclust$Facies + 1,
                                   testclust$Predictions + 1)
xtabclust <- t(xtabclust)
# BUG FIX: the labels were assigned to xtabgrad (copy-paste slip); label the
# freshly built xtabclust instead, from its own dimensions.
rownames(xtabclust) <- facies_names[seq_len(nrow(xtabclust))]
colnames(xtabclust) <- facies_names[seq_len(ncol(xtabclust))]
plotConfusionMatrix(xtabgrad, relScale = T)
plotConfusionMatrix(xtabclust, relScale = T)
# Confusion matrix and statistical information; dedicated names avoid
# shadowing base::t() with a data object
lvls <- union(testclust$Predictions + 1, testclust$Facies + 1)
conf_tab <- table(factor(testclust$Predictions + 1, lvls), factor(testclust$Facies + 1, lvls))
CMclust <- confusionMatrix(conf_tab,
                           mode = "everything")
cat("Accuracy (cluster): ",CMclust$overall[1], "\n")
## Accuracy (cluster): 0.5434298
cat("Accuracy (gradient):",CMgrad2$overall[1], "\n")
## Accuracy (gradient): 0.5701559
cat("Accuracy (polar): ",CMpol$overall[1], "\n")
## Accuracy (polar): 0.5122494
cat("Accuracy (LM): ",CMlm$overall[1], "\n")
## Accuracy (LM): 0.5011136
cat("Accuracy (original):",CM$overall[1], "\n")
## Accuracy (original): 0.4743875
# Shift labels back to 1..9 and plot predictions vs. true facies
dtemp <- testclust
dtemp$Predictions <- dtemp$Predictions + 1
dtemp$Facies <- dtemp$Facies + 1
plotwellPred(datap = dtemp, wname = testWell, dz = "Depth", wlogs = well_logs,
             facies = "Facies", classy = "Predictions", fcolor = facies_colors,
             fnames = facies_names)
# Median-filter the cluster-model predictions to suppress isolated facies
testclust2 <- testclust
testclust2$Classy <- testclust_pred
# Iterate only over wells actually present in the test set: looping over every
# well name would hand zero-length vectors to runmed().
for (w in unique(testclust2$Well.Name)) {
  rows <- which(testclust2$Well.Name %in% w)
  testclust2[rows, "Classy"] <- runmed(testclust2[rows, "Classy"],
                                       k = 5, endrule = "keep")
}
# Confusion matrix of the filtered test set
# (an unused "u" assignment, overwritten before use, was removed)
xtabclust2 <- createConfusionMatrix(testclust2$Facies + 1,
                                    testclust2$Classy + 1)
xtabclust2 <- t(xtabclust2)
rownames(xtabclust2) <- facies_names[seq_len(nrow(xtabclust2))]
colnames(xtabclust2) <- facies_names[seq_len(ncol(xtabclust2))]
plotConfusionMatrix(xtabclust, relScale = T)
plotConfusionMatrix(xtabclust2, relScale = T)
# Confusion matrix and statistical information; dedicated names avoid
# shadowing base::t() with a data object
lvls <- union(testclust2$Classy + 1, testclust2$Facies + 1)
conf_tab <- table(factor(testclust2$Classy + 1, lvls), factor(testclust2$Facies + 1, lvls))
CMclust2 <- confusionMatrix(conf_tab,
                            mode = "everything")
cat("Accuracy (cluster): ",CMclust2$overall[1], "\n")
## Accuracy (cluster): 0.5790646
cat("Accuracy (gradient):",CMgrad2$overall[1], "\n")
## Accuracy (gradient): 0.5701559
cat("Accuracy (polar): ",CMpol$overall[1], "\n")
## Accuracy (polar): 0.5122494
cat("Accuracy (LM): ",CMlm$overall[1], "\n")
## Accuracy (LM): 0.5011136
cat("Accuracy (original):",CM$overall[1], "\n")
## Accuracy (original): 0.4743875
# Plot the median-filtered cluster-model predictions against the true facies
dtemp <- testclust2
dtemp$Classy <- dtemp$Classy + 1
dtemp$Facies <- dtemp$Facies + 1
plotwellPred(datap = dtemp, wname = testWell, dz = "Depth", wlogs = well_logs,
             facies = "Facies", classy = "Classy", fcolor = facies_colors,
             fnames = facies_names)
# Feature importance of the cluster-feature model (top 10 by gain)
importance <- xgb.importance(feature_names = features_clust, model = modelclust)
xgb.ggplot.importance(importance, top_n = 10, measure = "Gain",
                      rel_to_first = T, n_clusters = 5)